%matplotlib inline
import pandas as pd
from tools import ready_data
# Read the csv files and reorder the data as input files, voter1, voter2, ...
# Each of these result files is loaded with ready_data() further down:
# file1 feeds the first boxplot/correlation analysis, file2 the second.
file1 = 'results-3AFT28WXLF2Y43L0GI7MY3EYIU1IO5.csv'
file2 = 'results-3P4ZBJFX2V3MZLSC6WSBC0ES548FWN.csv'
## Correlation p-values for Pearson's r
from scipy.stats import pearsonr
## Adapted from: https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance/45507587#45507587
def corr_pvalues(df):
    """Return the p-values of the pairwise Pearson correlations of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; every pair of columns is passed to
        ``scipy.stats.pearsonr``.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``df.columns``. The cell
        at row ``c``, column ``r`` holds the p-value of ``pearsonr(df[r],
        df[c])``. Cells stay NaN wherever ``pearsonr`` itself yields NaN
        (e.g. for a constant column).
    """
    columns = df.columns
    # Pre-size the result as a NaN-filled float frame instead of the
    # object-dtype frame the transpose/join trick produced.
    pvalues = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for r in columns:
        for c in columns:
            # Write through a single .loc call. The chained form
            # `pvalues[r][c] = ...` assigns into a temporary under pandas
            # copy-on-write and would leave the result untouched.
            pvalues.loc[c, r] = pearsonr(df[r], df[c])[1]
    return pvalues
# Load the first results file through the project helper.
df1, df2, name1, header1 = ready_data(file1)
print("The boxplot of sketch %s" % name1)
# The trailing 7 rows hold the metrics, which are not needed in the boxplot:
# drop them, then sort the columns by label.
df_2_refine = df2.drop(df2.tail(7).index).sort_index(axis=1)
boxplot1 = df_2_refine.boxplot(showmeans=True, rot=90, grid=False)
boxplot1.figure.savefig('Art_freeform_AP_03-boxplot.pdf')
boxplot1
# Work on a copy so the relabelling below does not touch df2.
df_corr = df2.copy()
# Collapse every numbered voter ('voter_<digits>...') into one 'human' group.
numbered_voters = df_corr['voter'].str.match('voter_[0-9]+')
df_corr.loc[numbered_voters, 'voter'] = 'human'
# Shorten the chamfer voter's label.
chamfer_rows = df_corr['voter'] == 'voter_chamfer'
df_corr.loc[chamfer_rows, 'voter'] = 'chamfer'
df_corr
# Per-group column means, transposed so that each voter group is a column.
df_corr_agg = df_corr.groupby('voter').mean().transpose()
df_corr_agg.corr()
# print( "Correlation coefficient, p value" )
# print( pearsonr( df_corr_agg.chamfer, df_corr_agg.human ) )
corr_pvalues(df_corr_agg)
# Same aggregate again, this time without the "ALG: MasteringSketching" column.
group_means = df_corr.groupby('voter').mean()
df_corr_agg = group_means.drop(columns=["ALG: MasteringSketching"]).transpose()
Correlation is not defined (NaN) for constant data. F-score with a very high threshold (36/1000) finds perfect similarity (a constant 1) in all cases except MasteringSketching. With MasteringSketching excluded, the F-score values are therefore constant, so their correlation is no longer defined.
# Pairwise correlation matrix between the voter groups (pandas' default
# method is Pearson), computed on the aggregate that has
# "ALG: MasteringSketching" dropped.
df_corr_agg.corr()
# Matching matrix of p-values for those pairwise correlations.
corr_pvalues( df_corr_agg )
# Load the second results file through the project helper.
df3, df4, name2, header2 = ready_data(file2)
print("The boxplot of sketch %s" % name2)
# Drop the trailing 7 rows before plotting, then sort the columns by label.
df_4_refine = df4.drop(df4.tail(7).index).sort_index(axis=1)
boxplot2 = df_4_refine.boxplot(showmeans=True, rot=90, grid=False)
boxplot2.figure.savefig('Ind_architecture_TU_02-boxplot.pdf')
boxplot2
# Work on a copy so the relabelling below does not touch df4.
df_corr = df4.copy()
# Collapse every numbered voter ('voter_<digits>...') into one 'human' group.
numbered_voters = df_corr['voter'].str.match('voter_[0-9]+')
df_corr.loc[numbered_voters, 'voter'] = 'human'
# Shorten the chamfer voter's label.
chamfer_rows = df_corr['voter'] == 'voter_chamfer'
df_corr.loc[chamfer_rows, 'voter'] = 'chamfer'
df_corr
# Per-group column means, transposed so that each voter group is a column.
df_corr_agg = df_corr.groupby('voter').mean().transpose()
df_corr_agg.corr()
corr_pvalues(df_corr_agg)
# Pearson result for the chamfer group against the human group.
print(pearsonr(df_corr_agg['chamfer'], df_corr_agg['human']))
# Same aggregate again, this time without the "ALG: MasteringSketching" column.
group_means = df_corr.groupby('voter').mean()
df_corr_agg = group_means.drop(columns=["ALG: MasteringSketching"]).transpose()
df_corr_agg.corr()
corr_pvalues(df_corr_agg)
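A low p-value means that the correlation between the two rankings is statistically significant, i.e. unlikely to arise by chance if the rankings were unrelated. It does not by itself mean that the two rankings are the same.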
# Per-column mean over the numbered (human) voters of the second data set:
# keep the rows whose voter id matches 'voter_<digits>', drop the id column,
# and average what is left.
df_ind = df4[ df4.voter.str.match('voter_[0-9]+') ].drop( columns=['voter'] ).mean()
df_ind
# Same for the first data set. The row mask has to be built from df2's own
# voter column; a mask taken from df4 selects rows by df4's labels, which
# only gives the right rows if both frames happen to share the same layout.
df_art = df2[ df2.voter.str.match('voter_[0-9]+') ].drop( columns=['voter'] ).mean()
df_art
# Put both sets of averages side by side, one column per sketch.
likert_averages = pd.DataFrame({'Art_freeform_AP_03': df_art, 'Ind_architecture_TU_02': df_ind })
likert_averages
import scipy.stats
# Paired Wilcoxon signed-rank test between the two sketches' averages.
scipy.stats.wilcoxon( likert_averages.Art_freeform_AP_03, likert_averages.Ind_architecture_TU_02 )
# Sanity check kept for reference: compare a column against a shifted copy of itself.
# scipy.stats.wilcoxon( likert_averages.Art_freeform_AP_03, likert_averages.Art_freeform_AP_03 + 0.01 )